Убираем индивидов с пропусками, убираем информацию о моделях и производителе, оставляем машины с четным количеством цилиндров (остальных слишком мало), делаем соответствующие столбцы факторами.
library(readxl)
library(dplyr)
library(tidyr)
df <- read_excel("CARDATA.xls") %>%
drop_na() %>%
filter(CYLINDER %in% c(4, 6, 8)) %>%
select(-MAKE, -MODEL) %>%
mutate(ROW = NULL, CYLINDER = as.factor(CYLINDER),
YEAR = as.factor(YEAR), ORIGIN = as.factor(ORIGIN))
head(df)
## # A tibble: 6 × 9
## MPG CYLINDER DISPLACE HORSEPOW ACCEL YEAR WEIGHT ORIGIN PRICE
## <dbl> <fct> <dbl> <dbl> <dbl> <fct> <dbl> <fct> <dbl>
## 1 43 4 90 48 22 78 1985 2 2400
## 2 36 4 98 66 14 78 1800 1 1900
## 3 33 4 78 52 19 78 1985 3 2200
## 4 39 4 85 70 19 78 2070 3 2725
## 5 36 4 91 60 16 78 1800 3 2250
## 6 20 8 260 110 16 78 3365 1 3300
ORIGIN: 1 - США, 2 - Европа, 3 - Япония. Посмотрим на корреляции.
library(ggplot2)
library(GGally)
df %>%ggpairs(diag=list(continuous = "barDiag"),
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
df %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = CYLINDER), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
df %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = YEAR), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
df %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = ORIGIN), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
Посмотрим на корреляции логарифмированных данных.
dfLog <- df %>% mutate(MPG = log(MPG), DISPLACE = log(DISPLACE),
HORSEPOW = log(HORSEPOW), ACCEL = log(ACCEL),
WEIGHT = log(WEIGHT), PRICE = log(PRICE))
dfLog %>%ggpairs(diag=list(continuous = "barDiag"),
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
dfLog %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = CYLINDER), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
dfLog %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = YEAR), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
dfLog %>%ggpairs(diag=list(continuous = "barDiag"), aes(colour = ORIGIN), legend = 1,
columns = c("MPG", "DISPLACE", "HORSEPOW", "ACCEL", "WEIGHT", "PRICE"))
Сравним корреляции данных до и после логарифмирования.
library(reshape)
ggplot(melt(cor(select(df, -CYLINDER, -YEAR, -ORIGIN), method = "pearson")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("pearson")
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "pearson")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("pearson log")
ggplot(melt(cor(select(df, -CYLINDER, -YEAR, -ORIGIN), method = "spearman")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("spearman")
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "spearman")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("spearman log")
ggplot(melt(cor(select(df, -CYLINDER, -YEAR, -ORIGIN), method = "kendall")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("kendall")
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "kendall")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("kendall log")
Можно заметить, что монотонные преобразования не влияют на Спирмана и
Кендала и что Кендал меньше по модулю Спирмана и Кендала.
Посмотрим на наличие аутлайнеров.
library(ggpubr)
# dfLog[1,9] <- 100
MPG <- dfLog %>% ggplot(aes(y=MPG)) + geom_boxplot()
DISPLACE <- dfLog %>% ggplot(aes(y=DISPLACE)) + geom_boxplot()
HORSEPOW <- dfLog %>% ggplot(aes(y=HORSEPOW)) + geom_boxplot()
ACCEL <- dfLog %>% ggplot(aes(y=ACCEL)) + geom_boxplot()
WEIGHT <- dfLog %>% ggplot(aes(y=WEIGHT)) + geom_boxplot()
PRICE <- dfLog %>% ggplot(aes(y=PRICE)) + geom_boxplot()
ggarrange(MPG, DISPLACE, HORSEPOW, ACCEL, WEIGHT, PRICE,
ncol = 3, nrow = 2)
Уберем выбросы и посмотрим, как это влияет на коэффициенты
корреляции.
dfLogOutlinersOff <- dfLog
Q1 <- quantile(dfLogOutlinersOff$ACCEL, probs=c(.25, .75))
iqr1 <- IQR(dfLogOutlinersOff$ACCEL)
up1 <- Q1[2]+1.5*iqr1
low1 <- Q1[1]-1.5*iqr1
Q2 <- quantile(dfLogOutlinersOff$PRICE, probs=c(.25, .75))
iqr2 <- IQR(dfLogOutlinersOff$PRICE)
up2 <- Q2[2]+1.5*iqr2
low2 <- Q2[1]-1.5*iqr2
tmp1 <-length(dfLogOutlinersOff$MPG)
dfLogOutlinersOff <- dfLogOutlinersOff %>% filter(ACCEL > low1, ACCEL < up1,
PRICE > low2, PRICE < up2)
tmp2 <- length(dfLogOutlinersOff$MPG)
tmp1 - tmp2
## [1] 7
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "pearson")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("pearson log")
ggplot(melt(cor(select(dfLogOutlinersOff, -CYLINDER, -YEAR, -ORIGIN), method = "pearson")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("pearson log outlinersOff")
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "spearman")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("spearman log")
ggplot(melt(cor(select(dfLogOutlinersOff, -CYLINDER, -YEAR, -ORIGIN), method = "spearman")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("spearman log outlinersOff")
ggplot(melt(cor(select(dfLog, -CYLINDER, -YEAR, -ORIGIN), method = "kendall")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("kendall log")
ggplot(melt(cor(select(dfLogOutlinersOff, -CYLINDER, -YEAR, -ORIGIN), method = "kendall")), aes(X1, X2)) +
geom_raster(aes(fill = value)) +
geom_text(aes(label = round(value, 1))) +
scale_fill_gradient2(low=colors()[143], mid='white', high=colors()[639]) +
ggtitle("kendall log outlinersOff")
Если есть сильный выброс, то он сильно влияет на Пирсона.